import os
from epyfun.fs import convert_to_utf8
# Example usage:
#file_path = './data/pooltemp/Outdoor_log_from_20230701_to_20230727.csv'
#encoding = convert_to_utf8(file_path, "interim/pooltemp")
#print(f"The file encoding was: {encoding}")
input_directory = './data/pooltemp'
for filename in os.listdir(input_directory):
if filename.endswith('.csv'):
input_file = os.path.join(input_directory, filename)
convert_to_utf8(input_file, 'interim/pooltemp/' + filename)Pool
%load_ext sql
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False| Config | value |
|---|---|
| feedback | True |
| autopandas | True |
| displaycon | False |
| dsn_filename | ./connections.ini |
%sql duckdb:///:default:
# %sql duckdb:///:memory:
# %sql duckdb:///path/to/file.dbIngest
%%sql
CREATE OR REPLACE VIEW pooltemp AS
SELECT *
FROM read_csv(
'interim/pooltemp/*.csv',
columns = {'time': 'TIMESTAMP', 'temp': 'DOUBLE'},
decimal_separator = ',',
delim = '\t',
filename = True,
header = True,
skip = 1
)
;| Success |
|---|
#%sqlcmd profile -t pooltemp;
#this does not seem to be working, it does not work on saved queries,
#nor if I create the view, nor if I create the table%%sql
CREATE OR REPLACE MACRO add_dups_count(_srctbl, _cols) AS TABLE
(SELECT *, COUNT(*) OVER (PARTITION BY _cols) AS _cnt
FROM pooltemp)
--TODO: figure out how to pass a table name to macros in DuckDB
--FROM _srctbl
;| Success |
|---|
%%sql
SELECT * FROM add_dups_count(pooltemp, time);| time | temp | filename | _cnt | |
|---|---|---|---|---|
| 0 | 2023-06-14 20:55:00 | 25.69 | interim/pooltemp/Outdoor_log_from_20220101_to_... | 1 |
| 1 | 2023-06-14 20:58:40 | 25.73 | interim/pooltemp/Outdoor_log_from_20220101_to_... | 1 |
| 2 | 2023-06-14 20:59:00 | 25.73 | interim/pooltemp/Outdoor_log_from_20220101_to_... | 1 |
| 3 | 2023-06-14 21:02:10 | 25.73 | interim/pooltemp/Outdoor_log_from_20220101_to_... | 1 |
| 4 | 2023-06-14 21:07:30 | 25.73 | interim/pooltemp/Outdoor_log_from_20220101_to_... | 1 |
| ... | ... | ... | ... | ... |
| 296575 | 2023-08-12 20:30:50 | 26.08 | interim/pooltemp/Outdoor_log_from_20230701_to_... | 1 |
| 296576 | 2023-08-12 20:31:40 | 26.08 | interim/pooltemp/Outdoor_log_from_20230701_to_... | 1 |
| 296577 | 2023-08-12 20:38:20 | 26.08 | interim/pooltemp/Outdoor_log_from_20230701_to_... | 1 |
| 296578 | 2023-08-12 20:40:50 | 26.08 | interim/pooltemp/Outdoor_log_from_20230701_to_... | 1 |
| 296579 | 2023-08-12 20:41:00 | 26.08 | interim/pooltemp/Outdoor_log_from_20230701_to_... | 1 |
296580 rows × 4 columns
%%sql
CREATE OR REPLACE MACRO count_dups_by(_srctbl, _cols) AS TABLE
SELECT
COUNT(*) AS _tot,
COUNT(*) FILTER(WHERE _cnt > 1) AS _dups,
COUNT(DISTINCT _cols) AS _uniq,
_dups - (_tot - _uniq) AS _duniq,
_dups / _duniq AS _puniq
FROM add_dups_count(_srctbl, _cols)
;| Success |
|---|
%%sql
SELECT * FROM count_dups_by(pooltemp, (time, temp));| _tot | _dups | _uniq | _duniq | _puniq | |
|---|---|---|---|---|---|
| 0 | 296580 | 42787 | 270593 | 16800 | 2.546845 |
%%sql --save pooltemp_clean
pooltemp_clean =<< SELECT time, avg(temp) AS temp
FROM pooltemp
GROUP BY time
ORDER BY time
;| time | temp | |
|---|---|---|
| 0 | 2023-06-14 20:52:20 | 25.63 |
| 1 | 2023-06-14 20:52:30 | 25.63 |
| 2 | 2023-06-14 20:52:40 | 25.63 |
| 3 | 2023-06-14 20:52:50 | 25.63 |
| 4 | 2023-06-14 20:53:00 | 25.63 |
| ... | ... | ... |
| 267979 | 2023-08-12 20:50:10 | 26.08 |
| 267980 | 2023-08-12 20:50:20 | 26.08 |
| 267981 | 2023-08-12 20:50:30 | 26.08 |
| 267982 | 2023-08-12 20:50:40 | 26.08 |
| 267983 | 2023-08-12 20:50:50 | 26.08 |
267984 rows × 2 columns
pooltemp_clean| time | temp | |
|---|---|---|
| 0 | 2023-06-14 20:52:20 | 25.63 |
| 1 | 2023-06-14 20:52:30 | 25.63 |
| 2 | 2023-06-14 20:52:40 | 25.63 |
| 3 | 2023-06-14 20:52:50 | 25.63 |
| 4 | 2023-06-14 20:53:00 | 25.63 |
| ... | ... | ... |
| 267979 | 2023-08-12 20:50:10 | 26.08 |
| 267980 | 2023-08-12 20:50:20 | 26.08 |
| 267981 | 2023-08-12 20:50:30 | 26.08 |
| 267982 | 2023-08-12 20:50:40 | 26.08 |
| 267983 | 2023-08-12 20:50:50 | 26.08 |
267984 rows × 2 columns
import plotly.express as px
import pandas as pd
import numpy as np
fig = px.scatter(pooltemp_clean, x="time", y="temp", render_mode='webgl')
fig.update_traces(marker_line=dict(width=1, color='DarkSlateGray'))
fig.show()%%timeit %%sql SELECT COUNT(DISTINCT (time, temp)), COUNT(*) FROM pooltemp ;
%%timeit %%sql SELECT COUNT() FROM ( SELECT time, COUNT() FROM pooltemp GROUP BY time HAVING COUNT(*) > 1
) ;
%%timeit %%sql –save duplicates_detail
SELECT FROM ( SELECT , COUNT() OVER (PARTITION BY time) AS cnt COUNT() OVER (PARTITION BY time, temp) AS cnt2 FROM pooltemp ) WHERE cnt > 1 – AND cnt <> cnt2 ORDER BY -cnt, time ;
%%sql
SELECT COUNT(*), COUNT(DISTINCT time),
COUNT(DISTINCT (time, temp))
FROM duplicates_detail ;
%%timeit %%sql
SELECT time, avg(temp) AS temp FROM pooltemp GROUP BY time ORDER BY time
%%sql
CREATE OR REPLACE MACRO count_uniq_by(cols) AS TABLE SELECT COUNT(DISTINCT cols) FROM pooltemp